This Jupyter notebook is intended to be used alongside the book Python for Bioinformatics
Note: The sample files used in this chapter must be accessible from this Jupyter notebook before they are opened. To that end, the following commands download the files from GitHub and extract them into a directory called samples.
In [ ]:
# -f makes curl fail on an HTTP error instead of saving the error page as the
# archive; -L follows redirects.
!curl -fL https://raw.githubusercontent.com/Serulab/Py4Bio/master/samples/samples.tar.bz2 -o samples.tar.bz2
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p samples
!tar xvfj samples.tar.bz2 -C samples
In [ ]:
import re
In [ ]:
# re.search scans the whole string for the first place the pattern matches.
# Matching is case-sensitive, so the capitalised "Hello" at the start is
# skipped and the later lowercase "hello" is the one found.
mo = re.search('hello', 'Hello world, hello Python!')
In [ ]:
# group() with no argument is the text matched by the whole pattern.
mo.group()
Out[ ]:
In [ ]:
# span() is the (start, end) index pair of the match in the searched string;
# end is one past the last matched character.
mo.span()
Out[ ]:
In [ ]:
# For a fixed substring, str.index gives the same start position without re.
'Hello world, hello Python!'.index('hello')
Out[ ]:
In [ ]:
import re
In [ ]:
# The character class [Hh] accepts either case for the first letter, so the
# search now succeeds at the very start of the string.
mo = re.search('[Hh]ello', 'Hello world, hello Python!')
In [ ]:
# Text matched by the search currently stored in mo.
mo.group()
Out[ ]:
In [ ]:
# findall returns every non-overlapping match as a list of strings.
re.findall("[Hh]ello","Hello world, hello Python,!")
Out[ ]:
In [ ]:
# finditer returns an iterator that yields match objects, not a list of
# strings, so displaying it shows only the iterator object itself.
re.finditer("[Hh]ello", "Hello world, hello Python,!")
Out[ ]:
In [ ]:
# Keep the iterator in a variable so its match objects can be walked later.
mos = re.finditer("[Hh]ello", "Hello world, hello Python,!")
In [ ]:
# Walk the match iterator, printing each match's text and its (start, end)
# span. The iterator is consumed by this loop; re-create it to iterate again.
for x in mos:
    print(x.group())
    print(x.span())
In [ ]:
# re.match only tries to match at the beginning of the string. The string
# starts with "H", not "h", so there is no match and mo is None.
mo = re.match("hello", "Hello world, hello Python!")
print (mo)
In [ ]:
# With the capitalised pattern the beginning of the string matches, so
# re.match returns a match object instead of None.
mo = re.match("Hello", "Hello world, hello Python!")
mo
Out[ ]:
In [ ]:
# Text matched by the match object currently stored in mo.
mo.group()
Out[ ]:
In [ ]:
# (start, end) indices of the match currently stored in mo.
mo.span()
Out[ ]:
In [ ]:
# Module-level call: the pattern is passed as a string on every call.
re.findall("[Hh]ello","Hello world, hello Python,!")
Out[ ]:
In [ ]:
# re.compile builds a reusable pattern object; its findall method takes only
# the string to scan because the pattern is already bound.
rgx = re.compile("[Hh]ello")
rgx.findall("Hello world, hello Python,!")
Out[ ]:
In [ ]:
# The compiled pattern also offers search, which returns the first match
# found anywhere in the string.
rgx = re.compile("[Hh]ello")
rgx.search("Hello world, hello Python,!")
Out[ ]:
In [ ]:
# match on a compiled pattern only tries the beginning of the string.
rgx.match("Hello world, hello Python,!")
Out[ ]:
In [ ]:
# findall on a compiled pattern: all non-overlapping matches as a list.
rgx.findall("Hello world, hello Python,!")
Out[ ]:
Listing 13.1: findTAT.py: Find every “TAT” occurrence
In [ ]:
import re

# Report every non-overlapping occurrence of "TAT" in the sequence, numbered
# from 1, together with its position.
seq = "ATATAAGATGCGCGCGCTTATGCGCGCA"
rgx = re.compile("TAT")
# enumerate(..., start=1) numbers the matches without a hand-kept counter.
for i, mo in enumerate(rgx.finditer(seq), start=1):
    print('Occurrence {0}: {1}'.format(i, mo.group()))
    # start() is the index of the first matched character and end() is one
    # past the last, so seq[mo.start():mo.end()] is the matched text.
    print('Position: From {0} to {1}'.format(mo.start(), mo.end()))
In [ ]:
import re
seq = "ATATAAGATGCGCGCGCTTATGCGCGCA"
# "(GC){3,}" matches "GC" repeated three or more times in a row.
rgx = re.compile("(GC){3,}")
result = rgx.search(seq)
# group() with no argument is the whole match: the complete run of repeats.
result.group()
Out[ ]:
In [ ]:
# groups() lists only the capturing groups of the match stored in result.
# A group that is repeated keeps just its last repetition.
result.groups()
Out[ ]:
In [ ]:
# Wrapping the whole expression in an outer group captures the complete run
# as group 1; the inner (GC) remains as group 2.
rgx = re.compile("((GC){3,})")
result = rgx.search(seq)
result.groups()
Out[ ]:
In [ ]:
# Only the inner group is non-capturing ("(?:...)"), so groups() holds a
# single item: the outer group, i.e. the complete run of GC repeats.
rgx = re.compile("((?:GC){3,})")
result = rgx.search(seq)
result.groups()
Out[ ]:
In [ ]:
# How findall shapes its result depends on the number of capturing groups in
# the pattern; this pattern has none.
rgx = re.compile("TAT") # No group at all.
rgx.findall(seq) # This returns a list of matching strings.
Out[ ]:
In [ ]:
# With exactly one capturing group, findall returns the group's text rather
# than the whole match. Because the group is repeated, only its last
# repetition ("GC") is kept for each match.
rgx = re.compile("(GC){3,}") # One group. Return a list
rgx.findall(seq) # with the group for each match.
Out[ ]:
In [ ]:
# With several capturing groups, each match becomes a tuple holding one item
# per group: here the complete run (outer group) and the last "GC" (inner).
rgx = re.compile("((GC){3,})") # Two groups. Return a
rgx.findall(seq) # list with tuples for each match.
Out[ ]:
In [ ]:
# Making the inner group non-capturing leaves one capturing group (the outer
# one), so findall returns plain strings holding each complete run.
rgx = re.compile("((?:GC){3,})") # Using a non-capturing
rgx.findall(seq) # group to get only the matches.
Out[ ]:
Listing 13.2: subgroups.py: Find multiple sub-patterns
In [ ]:
import re

seq = "ATATAAGATGCGCGCGCTTATGCGCGCA"
# Two named groups: TBX is "TATA" followed by any two characters, and
# CGislands is a run of three or more "GC". The ".*" between them is greedy,
# so CGislands ends up being the last qualifying run in the sequence.
rgx = re.compile("(?P<TBX>TATA..).*(?P<CGislands>(?:GC){3,})")
result = rgx.search(seq)
# Named groups are retrieved by name; print them in a fixed order.
for name in ('CGislands', 'TBX'):
    print(result.group(name))
Listing 13.3: regexsys1.py: Count the lines that contain a user-supplied pattern
In [ ]:
import re
import sys


def count_matching_lines(path, pattern):
    """Return how many lines of the file at *path* contain *pattern*.

    A line counts once, however many times the pattern occurs in it.

    Args:
        path: Name of the text file to read.
        pattern: Regular expression to search for in every line.

    Raises:
        OSError: If the file cannot be opened.
        re.error: If *pattern* is not a valid regular expression.
    """
    myregex = re.compile(pattern)
    with open(path) as fh:
        return sum(1 for line in fh if myregex.search(line))


if __name__ == "__main__":
    # Script usage: the file name is the first argument, the pattern the second.
    if len(sys.argv) < 3:
        sys.exit("Usage: regexsys1.py FILE PATTERN")
    print(count_matching_lines(sys.argv[1], sys.argv[2]))
Listing 13.4: countinfile.py: Count the occurrences of a pattern in a file
In [ ]:
import re
import sys


def count_occurrences(path, pattern):
    """Return the total number of matches of *pattern* in the file at *path*.

    The file is scanned line by line and the non-overlapping matches found in
    each line are added up, so a match can never span two lines.

    Args:
        path: Name of the text file to read.
        pattern: Regular expression to count.

    Raises:
        OSError: If the file cannot be opened.
        re.error: If *pattern* is not a valid regular expression.
    """
    myregex = re.compile(pattern)
    with open(path) as fh:
        return sum(len(myregex.findall(line)) for line in fh)


if __name__ == "__main__":
    # Script usage: the file name is the first argument, the pattern the second.
    if len(sys.argv) < 3:
        sys.exit("Usage: countinfile.py FILE PATTERN")
    print(count_occurrences(sys.argv[1], sys.argv[2]))
Listing 13.5: deletegc.py: Delete GC repeats (3 or more GC in a row)
In [ ]:
import re

# "(?:GC){3,}" matches three or more consecutive "GC" pairs; the group is
# non-capturing because it is only needed for the repetition.
regex = re.compile("(?:GC){3,}")
seq = "ATGATCGTACTGCGCGCTTCATGTGATGCGCGCGCGCAGACTATAAG"
# Replacing each run with the empty string deletes it from the sequence.
cleaned = regex.sub("", seq)
for label, text in (("Before:", seq), ("After:", cleaned)):
    print(label, text)
Listing 13.6: searchinfasta.py: Search a pattern in a FASTA file
In [ ]:
import re


def highlight_pattern_in_fasta(path, pattern, flank=10):
    """Return the first match of *pattern* in a FASTA file, with its context.

    The first line of the file is discarded as the header and all remaining
    lines are joined into one sequence, so the file is expected to hold a
    single record. The matched text is returned as found in the file, with up
    to *flank* characters on each side converted to lower case.

    Args:
        path: Name of the FASTA file to read.
        pattern: Regular expression to search for in the sequence.
        flank: Maximum number of context characters to keep on each side.

    Returns:
        The match with its lower-cased context, or None if *pattern* does not
        occur in the sequence.
    """
    with open(path) as fh:
        fh.readline()  # Discard the first line.
        seq = "".join(line.strip() for line in fh)
    rgx = re.compile(pattern)
    result = rgx.search(seq)
    if result is None:
        # search() returns None when nothing matches; calling group() on it
        # would raise AttributeError.
        return None
    start, end = result.span()
    # Clamp at 0 so a match near the start does not yield a negative index,
    # which would slice from the end of the sequence instead.
    leftpos = max(0, start - flank)
    return (seq[leftpos:start].lower() + result.group() +
            seq[end:end + flank].lower())


if __name__ == "__main__":
    highlighted = highlight_pattern_in_fasta('samples/Q5R5X8.fas',
                                             "[LIVM]{2}.RL[DE].{4}RLE")
    if highlighted is None:
        print("Pattern not found")
    else:
        print(highlighted)
Listing 13.7: cleanseq.py: Cleans a DNA sequence
In [ ]:
import re

# Matches every character to strip from the raw sequence: spaces, digits,
# newlines and tabs. A raw string is required so that "\d" reaches the regex
# engine untouched; in a normal string literal it is an invalid escape
# sequence, which recent Python versions warn about.
regex = re.compile(r' |\d|\n|\t')


def clean_sequence_file(path):
    """Return the contents of *path* with spaces, digits, newlines and tabs removed.

    Args:
        path: Name of the text file holding the raw sequence.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The with-statement closes the file even if reading fails.
    with open(path) as fh:
        return ''.join(regex.sub('', line) for line in fh)


if __name__ == "__main__":
    print(clean_sequence_file('samples/pMOSBlue.txt'))